Assignment 03

Author
Affiliation

Dakota Alder - dkalder

Boston University

Published

September 22, 2025

Modified

September 25, 2025

1 Loading the Dataset

import pandas as pd
import plotly.express as px
import plotly.io as pio
from pyspark.sql import SparkSession
import re
import numpy as np
import plotly.graph_objects as go
from pyspark.sql.functions import col, split, explode, regexp_replace, transform, when
from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id

np.random.seed(42)

pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("data/lightcast_job_postings.csv")
df.createOrReplaceTempView("job_postings")

# Show Schema and Sample Data
#print("---This is Diagnostic check, No need to print it in the final doc---")

#df.printSchema() # comment this line when rendering the submission
#df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/25 03:19:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 1:>                                                          (0 + 1) / 1]                                                                                25/09/25 03:19:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

2 Data Preparation

[Stage 2:>                                                          (0 + 1) / 1]                                                                                [Stage 3:>                                                          (0 + 1) / 1]                                                                                [Stage 4:>                                                          (0 + 1) / 1]                                                                                
Medians: 87295.0 130042.0 115024.0
[Stage 5:>                                                          (0 + 1) / 1]                                                                                
Data Cleaning Complete. Rows retained: 72498

3 Salary Distribution by Employment Type

[Stage 6:>                                                          (0 + 1) / 1]                                                                                

4 Salary Distribution by Industry

#Question 2 Code

#Select Industry and Salary Columns
pdf = df.select("NAICS2_NAME", "SALARY_FROM").toPandas()


#Create box plot with Horizontal grid lines
fig = px.box(
  pdf,
  x="NAICS2_NAME",
  y="SALARY_FROM",
  title="Salary Distribution by Industry",
  color_discrete_sequence=["blue"],
  boxmode="group",
  points="all",
)



fig.update_layout(
  title=dict(
      text="Salary Distribution by Industry",
      font=dict(size=25, family="Arial", color="black", weight="bold")
  ),
  xaxis=dict(
    title=dict(text="Industry", font=dict(size=24, family="Arial", color="black", weight="bold")),
    tickangle=45,
    tickfont=dict(size=12, family="Arial", color="black",weight="bold"),
    showline=True,
    linewidth=2,
    linecolor="black",
    mirror=True,
    showgrid=False,
    categoryorder="array",
    categoryarray=sorted_employment_types.tolist()
  ),
  yaxis=dict(
    title=dict(text="Salary (K $)", font=dict(size=24, family="Arial", color="black", weight="bold")),
    tickvals=[0,50000,100000,150000,200000,250000,300000,350000,400000,450000,500000],
    ticktext=["0","50K","100K","150K","200K","250K","300K","350K","400K","450K","500K"],
    tickfont=dict(size=18, family="Arial", color="black",weight="bold"),
    showline=True,
    linewidth=2,
    linecolor="black",
    mirror=True,
    showgrid=False,
    gridcolor="lightgray",
    gridwidth=.5
  ),
  font=dict(family="Arial", size=16, color="black"),
  boxgap=0.7,
  plot_bgcolor="white",
  paper_bgcolor="white",
  showlegend=False,
  height=800,
  width=1000,
)
fig.show()
fig.write_html("output/Q2.html")
#fig.write_image("output/Q2.svg", width=850, height=500, scale=1)
[Stage 7:>                                                          (0 + 1) / 1]                                                                                

5 Question 2: Salary Analysis by ONET Occupation Type

#Query the required data with Spark SQL

onet_type = spark.sql("""
  SELECT LOT_OCCUPATION_NAME AS Occupation_Name, PERCENTILE(SALARY, 0.5) As Median_Salary, COUNT(*) As Job_Postings
  FROM job_postings
  GROUP BY LOT_OCCUPATION_NAME
  ORDER BY Job_Postings DESC
  LIMIT 10
  """)

onet_pd = onet_type.toPandas()

fig = px.scatter(
  onet_pd,
  x="Occupation_Name",
  y="Median_Salary",
  size="Job_Postings",
  title="Salary Analysis by Occupation Name (Bubble Chart)",
  labels={
    "Occupation_Name": "Occupation Name",
    "Median_Salary": "Median Salary",
    "Job_Postings": "Number of Job Postings"
  },
  hover_name="Occupation_Name",
  size_max=60,
  width=1000,
  height=600,
  color="Job_Postings",
  color_continuous_scale="Plasma"
)

fig.update_layout(
  font_family="Arial",
  font_size=12,
  title_font_size=24,
  xaxis_title="Occupation Name",
  yaxis_title="Median Salary",
  plot_bgcolor="white",
  paper_bgcolor="#f0fff0",
  xaxis=dict(
    tickangle=-45,
    showline=True,
    linecolor="black"
  ),
  yaxis=dict(
    showline=True,
    linecolor="black"
  )
)
fig.show()
fig.write_html("output/Q3.html")
[Stage 8:>                                                          (0 + 1) / 1]                                                                                

6 Salary by Education Level

#Create 2 groups of Education Levels
lower_deg = ["Bachelor's", "Associate", "GED", "No Education Listed", "High School"]
higher_deg = ["Master's degree", "PhD or professional degree"]

df = df.withColumn(
  "EDU_GROUP",
  when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}" for deg in lower_deg])), "Bachelor's or lower")
  .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}" for deg in higher_deg])), "Master's or PhD")
  .otherwise("Other")
)


df = df.withColumn("Average_Salary", col("Average_Salary").cast("float"))

df = df.filter(
  col("MAX_YEARS_EXPERIENCE").isNotNull() &
  col("Average_Salary").isNotNull() &
  (col("MAX_YEARS_EXPERIENCE") > 0) &
  (col("Average_Salary") > 0)
)


df_filtered = df.filter(col("EDU_GROUP").isin("Bachelor's or lower", "Master's or PhD"))

edu_cols = [
  "EDU_GROUP",
  "MAX_YEARS_EXPERIENCE",
  "Average_Salary",
  "LOT_V6_SPECIALIZED_OCCUPATION_NAME"
]

df_filtered = df_filtered.select(*edu_cols)
edu_pd = df_filtered.toPandas()

fig = px.scatter(
  edu_pd,
  x="MAX_YEARS_EXPERIENCE",
  y="Average_Salary",
  color="EDU_GROUP",
  title="SALARY ANALYSIS BY EDU GROUP AND OCCUPATION",
  opacity=.7,
  color_discrete_sequence=["blue", "red"],
  hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
  width=1000,
  height=600,
)

fig.update_layout(
  font_family="Arial",
  font_size=12,
  title_font_size=24,
  xaxis_title="Max Years Experience",
  yaxis_title="Average Salary",
  legend_title="Education Group",
  plot_bgcolor="white",
  paper_bgcolor="#f0fff0",
  xaxis=dict(
    showline=True,
    linecolor="black",
    showgrid=True,
    gridcolor="lightgray",
    gridwidth=1,
    griddash="dash"
  ),
  yaxis=dict(
    showline=True,
    linecolor="black",
    showgrid=True,
    gridcolor="lightgray",
    gridwidth=1,
    griddash="dash"
  )
)
fig.show()
fig.write_html("output/Q4.html")
[Stage 11:>                                                         (0 + 1) / 1]                                                                                

7 Salary by Remote Work Type

# Group Remote Types into 3 groups
df = df.withColumn(
  "REMOTE_GROUP",
  when(col("REMOTE_TYPE_NAME").rlike("(?i)hybrid"), "Hybrid")
  .when(col("REMOTE_TYPE_NAME").rlike("(?i)remote"), "Remote")
  .otherwise("Onsite")
)

df = df.withColumn("Average_Salary", col("Average_Salary").cast("float"))

df = df.filter(
  col("MAX_YEARS_EXPERIENCE").isNotNull() &
  col("Average_Salary").isNotNull() &
  (col("MAX_YEARS_EXPERIENCE") > 0) &
  (col("Average_Salary") > 0)
)


remote_filtered = df.filter(col("REMOTE_GROUP").isin("Remote", "Hybrid", "Onsite"))

remote_cols = [
  "REMOTE_GROUP",
  "MAX_YEARS_EXPERIENCE",
  "Average_Salary",
  "LOT_V6_SPECIALIZED_OCCUPATION_NAME"
]

remote_filtered = remote_filtered.select(*remote_cols)
remote_pd = remote_filtered.toPandas()

fig = px.scatter(
  remote_pd,
  x="MAX_YEARS_EXPERIENCE",
  y="Average_Salary",
  color="REMOTE_GROUP",
  title="SALARY ANALYSIS BY REMOTE WORK TYPE",
  opacity=.7,
  color_discrete_sequence=["blue", "red", "green"],
  hover_data=["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
  width=1000,
  height=600,
)

fig.update_layout(
  font_family="Arial",
  font_size=12,
  title_font_size=24,
  xaxis_title="Max Years Experience",
  yaxis_title="Average Salary",
  legend_title="Remote Work Type",
  plot_bgcolor="white",
  paper_bgcolor="#f0fff0",
  xaxis=dict(
    showline=True,
    linecolor="black",
    showgrid=True,
    gridcolor="lightgray",
    gridwidth=1,
    griddash="dash"
  ),
  yaxis=dict(
    showline=True,
    linecolor="black",
    showgrid=True,
    gridcolor="lightgray",
    gridwidth=1,
    griddash="dash"
  )
)
fig.show()
fig.write_html("output/Q5.html")
[Stage 12:>                                                         (0 + 1) / 1]                                                                                

8 Salary Histograms for Remote Type Work

#Histograms

fig = px.histogram(
  remote_pd,
  x = "Average_Salary",
  color="REMOTE_GROUP",
  nbins=40,
  opacity=0.7,
  barmode="overlay",
  color_discrete_sequence=["blue","red","green"],
  title="Histogram of Salary Distribution"
)

fig.update_layout(
  xaxis_title="Average Salary",
  yaxis_title="Count",
  plot_bgcolor="white",
  paper_bgcolor="#f0fff0",
  xaxis=dict(
    showline=True,
    linecolor="black",
    showgrid=True,
    gridcolor="lightgray",
    gridwidth=1
  ),
  yaxis=dict(
    showline=True,
    linecolor="black",
    showgrid=True,
    gridcolor="lightgray",
    gridwidth=1
  )
)
fig.update_xaxes(range=[0, 500000])


fig.show()
fig.write_html("output/Q6.html")